#Sidak Yntiso
#Extract WA data
#April 22 2020
#1. Read each page into RDA file
#2. Combine RDA files
rm(list=ls())
library(pdftools)
library(dplyr)
library(rvest)
library(RSelenium)
library(readr)
library(stringi)

#################################
#1. Read each page into RDA file#
#################################

# Source PDF provided by the Washington State Department of Corrections.
# `dir` is the folder prefix that every path in this script is built from.
dir <- ""
WA_file <- paste0(dir, "\\P-8928_Yntiso_Final.pdf")

# Extract the text of the PDF and discard the first page.
file1 <- pdf_text(WA_file)[-1]

# Banner text that precedes the data on the first remaining page: page
# counter, source/producer notes and the description of the data request.
# It is assembled line by line here and handed to gsub() as the pattern.
report_banner <- paste0(
  "1 OF 5\r\n",
  "Source: DOC OMNI Tables as of 9/30/2019\r\n",
  "Produced by: Research Data Analytics Unit 10/23/2019\r\n",
  "The following data represent sentence information for all offenders who were under DOC jurisdiction sometime between 1/1/2015 and 12/31/2018. The sentences are only those that also fell under\r\n",
  "the jurisdiction of the DOC at any time, whether sentence occurred before, during or after the specific time frame requested. The data reflects the original sentence and not any subsequent changes\r\n",
  "to the sentence that could occur due to a revocation. The data does not include any sentences that were Expunged or Vacated.\r\n",
  "Please refer to DICTIONARY sheet for descriptions of each field listed.\r\n"
)

# Strip the banner so the first page contains only data lines.
file1[[1]] <- gsub(report_banner, "", file1[[1]])

# Read each page of the PDF in turn and store one row per text line:
# page number, line number within the page, and the raw line text.
# Rows are written to disk in checkpoints so that no single in-memory data
# frame has to hold the whole document.
page_rows <- list()             # per-page data frames since the last checkpoint
checkpoint_pages <- integer(0)  # page indices at which a checkpoint was saved

for (i in seq_along(file1)) {
  # NOTE(review): assumes pdf_text() separates lines with "\r\n" -- confirm
  # for the pdftools version / platform in use.
  page_lines <- strsplit(file1[i], split = "\r\n")[[1]]

  # A page that yields no text lines contributes no rows instead of
  # aborting the whole run.
  if (length(page_lines) > 0) {
    page_rows[[length(page_rows) + 1L]] <- data.frame(
      page = rep(i, length(page_lines)),
      line = seq_along(page_lines),
      contents = page_lines
    )
  }
  print(i)

  # Checkpoint at pages 1, 101, 201, ... and additionally at the final page,
  # so the pages after the last regular checkpoint are written out as well.
  if ((i %% 100) == 1 || i == length(file1)) {
    reduce_dat <- if (length(page_rows) > 0) {
      plyr::rbind.fill(page_rows)
    } else {
      data.frame()
    }
    save(reduce_dat, file = paste0(dir, "\\", i, "_WA_file_raw_apr2020.RDA"))
    checkpoint_pages <- c(checkpoint_pages, i)
    page_rows <- list()
  }
}

######################
#2. Combine RDA files#
######################

# Free the page text and loop temporaries, but keep the output folder and the
# list of checkpoints: both are still needed below. (Removing everything
# would also remove `dir`, and the paths below could no longer be built.)
rm(list = setdiff(ls(), c("dir", "checkpoint_pages")))

# When this section is run without the extraction loop above, fall back to
# the fixed checkpoint list 1, 101, ..., 3301.
if (!exists("checkpoint_pages")) {
  checkpoint_pages <- c(1, seq(101, 3301, 100))
}
if (length(checkpoint_pages) == 0) {
  stop("No checkpoint files were written; nothing to combine.", call. = FALSE)
}

# Load every checkpoint into its own environment so the `reduce_dat` object
# stored in one file cannot overwrite the one from another file.
chunks <- lapply(checkpoint_pages, function(page_id) {
  chunk_file <- paste0(dir, "\\", page_id, "_WA_file_raw_apr2020.RDA")
  if (!file.exists(chunk_file)) {
    stop("Missing checkpoint file: ", chunk_file, call. = FALSE)
  }
  chunk_env <- new.env()
  load(chunk_file, envir = chunk_env)
  get("reduce_dat", envir = chunk_env, inherits = FALSE)
})

# Stack all checkpoints in a single call. Checkpoints with a higher page
# index are placed first; the `page` and `line` columns allow re-sorting.
reduce_dat <- plyr::rbind.fill(rev(chunks))
rm(chunks)

# Drop rows that are exact duplicates (same page, line and contents), then
# write the combined data set.
reduce_dat <- reduce_dat[!duplicated(reduce_dat), ]
save(reduce_dat, file = paste0(dir, "\\WA_file_raw_apr2020.RDA"))
